knitr::opts_chunk$set(echo = TRUE) 
library(reticulate)
# use_python("/Users/oldemarrodriguez/anaconda3/bin/python3.7") # PROMIDAT
use_python("/anaconda3/bin/python3.6") ## Portátil

Árboles de Decisión

Paquetes necesarios

import os
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from   sklearn.model_selection import train_test_split
from   sklearn.tree import DecisionTreeClassifier
from   sklearn.metrics import confusion_matrix
from   sklearn.tree import export_graphviz

Se debe instalar el paquete graphviz

En Mac (Terminal):

pip install graphviz
conda install graphviz

En Windows (Anaconda Prompt):

pip install graphviz
conda install graphviz

Para mostrar un gráfico en un archivo rmd de un árbol

def graficar_arbol(grafico = None):
    """Render a graphviz Source object as a PNG and display it inline.

    Needed so the decision-tree image shows up inside an R Markdown
    (knitr) chunk, where graphviz's interactive viewer is not available.

    Parameters
    ----------
    grafico : graphviz.Source
        Graph to render. Must not be None.

    Raises
    ------
    ValueError
        If no graph object is supplied (the original default of None
        would otherwise fail with an obscure AttributeError).
    """
    if grafico is None:
        raise ValueError("graficar_arbol: se debe suministrar un objeto graphviz")
    grafico.format = "png"
    archivo  = grafico.render()   # writes the .png file and returns its path
    img = mpimg.imread(archivo)
    plt.imshow(img)
    plt.axis('off')
    plt.show()
    plt.close()

Índices para matrices NxN

def indices_general(MC, nombres = None):
    """Compute global quality indices from an NxN confusion matrix.

    Parameters
    ----------
    MC : numpy.ndarray
        Square confusion matrix (rows = true classes, columns = predictions,
        as returned by sklearn.metrics.confusion_matrix).
    nombres : list of str, optional
        Class names used as column labels of the per-category precision.

    Returns
    -------
    dict
        Confusion matrix, global precision, global error and a 1-row
        DataFrame of per-category precision (diagonal / row totals,
        i.e. recall of each class).
    """
    precision_global = np.sum(MC.diagonal()) / np.sum(MC)
    error_global = 1 - precision_global
    # Diagonal over row totals -> one value per class, transposed to a 1-row frame
    precision_categoria  = pd.DataFrame(MC.diagonal()/np.sum(MC,axis = 1)).T
    # 'is not None' instead of '!= None': PEP 8 idiom, and avoids an ambiguous
    # element-wise comparison if an array is ever passed as nombres.
    if nombres is not None:
        precision_categoria.columns = nombres
    return {"Matriz de Confusión":MC, 
            "Precisión Global":precision_global, 
            "Error Global":error_global, 
            "Precisión por Categoría":precision_categoria}

Ejemplo datos de Iris

os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('iris.csv',delimiter=';',decimal=".")
print(datos.shape)
## (150, 5)
print(datos.head())
##    s.largo  s.ancho  p.largo  p.ancho    tipo
## 0      5.1      3.5      1.4      0.2  setosa
## 1      4.9      3.0      1.4      0.2  setosa
## 2      4.7      3.2      1.3      0.2  setosa
## 3      4.6      3.1      1.5      0.2  setosa
## 4      5.0      3.6      1.4      0.2  setosa
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 150 entries, 0 to 149
## Data columns (total 5 columns):
## s.largo    150 non-null float64
## s.ancho    150 non-null float64
## p.largo    150 non-null float64
## p.ancho    150 non-null float64
## tipo       150 non-null object
## dtypes: float64(4), object(1)
## memory usage: 5.9+ KB
## None

Elimina la variable categórica, deja las variables predictoras en X

X = datos.iloc[:,:4] 
print(X.head())
##    s.largo  s.ancho  p.largo  p.ancho
## 0      5.1      3.5      1.4      0.2
## 1      4.9      3.0      1.4      0.2
## 2      4.7      3.2      1.3      0.2
## 3      4.6      3.1      1.5      0.2
## 4      5.0      3.6      1.4      0.2

Deja la variable a predecir en y

y = datos.iloc[:,4:5] 
print(y.head())
##      tipo
## 0  setosa
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa

Con el 70% de los datos para entrenamiento

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
##   FutureWarning)
print(X_train)
##      s.largo  s.ancho  p.largo  p.ancho
## 60       5.0      2.0      3.5      1.0
## 116      6.5      3.0      5.5      1.8
## 144      6.7      3.3      5.7      2.5
## 119      6.0      2.2      5.0      1.5
## 108      6.7      2.5      5.8      1.8
## 69       5.6      2.5      3.9      1.1
## 135      7.7      3.0      6.1      2.3
## 56       6.3      3.3      4.7      1.6
## 80       5.5      2.4      3.8      1.1
## 123      6.3      2.7      4.9      1.8
## 133      6.3      2.8      5.1      1.5
## 106      4.9      2.5      4.5      1.7
## 146      6.3      2.5      5.0      1.9
## 50       7.0      3.2      4.7      1.4
## 147      6.5      3.0      5.2      2.0
## 85       6.0      3.4      4.5      1.6
## 30       4.8      3.1      1.6      0.2
## 101      5.8      2.7      5.1      1.9
## 94       5.6      2.7      4.2      1.3
## 64       5.6      2.9      3.6      1.3
## 89       5.5      2.5      4.0      1.3
## 91       6.1      3.0      4.6      1.4
## 125      7.2      3.2      6.0      1.8
## 48       5.3      3.7      1.5      0.2
## 13       4.3      3.0      1.1      0.1
## 111      6.4      2.7      5.3      1.9
## 95       5.7      3.0      4.2      1.2
## 20       5.4      3.4      1.7      0.2
## 15       5.7      4.4      1.5      0.4
## 52       6.9      3.1      4.9      1.5
## ..       ...      ...      ...      ...
## 14       5.8      4.0      1.2      0.2
## 122      7.7      2.8      6.7      2.0
## 19       5.1      3.8      1.5      0.3
## 29       4.7      3.2      1.6      0.2
## 130      7.4      2.8      6.1      1.9
## 49       5.0      3.3      1.4      0.2
## 136      6.3      3.4      5.6      2.4
## 99       5.7      2.8      4.1      1.3
## 82       5.8      2.7      3.9      1.2
## 79       5.7      2.6      3.5      1.0
## 115      6.4      3.2      5.3      2.3
## 145      6.7      3.0      5.2      2.3
## 72       6.3      2.5      4.9      1.5
## 77       6.7      3.0      5.0      1.7
## 25       5.0      3.0      1.6      0.2
## 81       5.5      2.4      3.7      1.0
## 140      6.7      3.1      5.6      2.4
## 142      5.8      2.7      5.1      1.9
## 39       5.1      3.4      1.5      0.2
## 58       6.6      2.9      4.6      1.3
## 88       5.6      3.0      4.1      1.3
## 70       5.9      3.2      4.8      1.8
## 87       6.3      2.3      4.4      1.3
## 36       5.5      3.5      1.3      0.2
## 21       5.1      3.7      1.5      0.4
## 9        4.9      3.1      1.5      0.1
## 103      6.3      2.9      5.6      1.8
## 67       5.8      2.7      4.1      1.0
## 117      7.7      3.8      6.7      2.2
## 47       4.6      3.2      1.4      0.2
## 
## [105 rows x 4 columns]
print(X_test)
##      s.largo  s.ancho  p.largo  p.ancho
## 114      5.8      2.8      5.1      2.4
## 62       6.0      2.2      4.0      1.0
## 33       5.5      4.2      1.4      0.2
## 107      7.3      2.9      6.3      1.8
## 7        5.0      3.4      1.5      0.2
## 100      6.3      3.3      6.0      2.5
## 40       5.0      3.5      1.3      0.3
## 86       6.7      3.1      4.7      1.5
## 76       6.8      2.8      4.8      1.4
## 71       6.1      2.8      4.0      1.3
## 134      6.1      2.6      5.6      1.4
## 51       6.4      3.2      4.5      1.5
## 73       6.1      2.8      4.7      1.2
## 54       6.5      2.8      4.6      1.5
## 63       6.1      2.9      4.7      1.4
## 37       4.9      3.1      1.5      0.1
## 78       6.0      2.9      4.5      1.5
## 90       5.5      2.6      4.4      1.2
## 45       4.8      3.0      1.4      0.3
## 16       5.4      3.9      1.3      0.4
## 121      5.6      2.8      4.9      2.0
## 66       5.6      3.0      4.5      1.5
## 24       4.8      3.4      1.9      0.2
## 8        4.4      2.9      1.4      0.2
## 126      6.2      2.8      4.8      1.8
## 22       4.6      3.6      1.0      0.2
## 44       5.1      3.8      1.9      0.4
## 97       6.2      2.9      4.3      1.3
## 93       5.0      2.3      3.3      1.0
## 26       5.0      3.4      1.6      0.4
## 137      6.4      3.1      5.5      1.8
## 84       5.4      3.0      4.5      1.5
## 27       5.2      3.5      1.5      0.2
## 127      6.1      3.0      4.9      1.8
## 132      6.4      2.8      5.6      2.2
## 59       5.2      2.7      3.9      1.4
## 18       5.7      3.8      1.7      0.3
## 83       6.0      2.7      5.1      1.6
## 61       5.9      3.0      4.2      1.5
## 92       5.8      2.6      4.0      1.2
## 112      6.8      3.0      5.5      2.1
## 2        4.7      3.2      1.3      0.2
## 141      6.9      3.1      5.1      2.3
## 43       5.0      3.5      1.6      0.6
## 10       5.4      3.7      1.5      0.2
print(y_train)
##            tipo
## 60   versicolor
## 116   virginica
## 144   virginica
## 119   virginica
## 108   virginica
## 69   versicolor
## 135   virginica
## 56   versicolor
## 80   versicolor
## 123   virginica
## 133   virginica
## 106   virginica
## 146   virginica
## 50   versicolor
## 147   virginica
## 85   versicolor
## 30       setosa
## 101   virginica
## 94   versicolor
## 64   versicolor
## 89   versicolor
## 91   versicolor
## 125   virginica
## 48       setosa
## 13       setosa
## 111   virginica
## 95   versicolor
## 20       setosa
## 15       setosa
## 52   versicolor
## ..          ...
## 14       setosa
## 122   virginica
## 19       setosa
## 29       setosa
## 130   virginica
## 49       setosa
## 136   virginica
## 99   versicolor
## 82   versicolor
## 79   versicolor
## 115   virginica
## 145   virginica
## 72   versicolor
## 77   versicolor
## 25       setosa
## 81   versicolor
## 140   virginica
## 142   virginica
## 39       setosa
## 58   versicolor
## 88   versicolor
## 70   versicolor
## 87   versicolor
## 36       setosa
## 21       setosa
## 9        setosa
## 103   virginica
## 67   versicolor
## 117   virginica
## 47       setosa
## 
## [105 rows x 1 columns]
print(y_test)
##            tipo
## 114   virginica
## 62   versicolor
## 33       setosa
## 107   virginica
## 7        setosa
## 100   virginica
## 40       setosa
## 86   versicolor
## 76   versicolor
## 71   versicolor
## 134   virginica
## 51   versicolor
## 73   versicolor
## 54   versicolor
## 63   versicolor
## 37       setosa
## 78   versicolor
## 90   versicolor
## 45       setosa
## 16       setosa
## 121   virginica
## 66   versicolor
## 24       setosa
## 8        setosa
## 126   virginica
## 22       setosa
## 44       setosa
## 97   versicolor
## 93   versicolor
## 26       setosa
## 137   virginica
## 84   versicolor
## 27       setosa
## 127   virginica
## 132   virginica
## 59   versicolor
## 18       setosa
## 83   versicolor
## 61   versicolor
## 92   versicolor
## 112   virginica
## 2        setosa
## 141   virginica
## 43       setosa
## 10       setosa

Mediante el constructor inicializa la instancia_arbol

instancia_arbol = DecisionTreeClassifier(random_state=0)

Entrena el modelo llamando al método fit

Nota: Esto se debe a que, al ser Python orientado a objetos, el modelo queda en un atributo de la instancia “instancia_arbol” al llamar el método fit de la clase DecisionTreeClassifier.

instancia_arbol.fit(X_train,y_train)

Imprime las predicciones en testing

print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['virginica' 'versicolor' 'setosa' 'virginica' 'setosa' 'virginica'
##  'setosa' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor'
##  'versicolor' 'versicolor' 'versicolor' 'setosa' 'versicolor' 'versicolor'
##  'setosa' 'setosa' 'virginica' 'versicolor' 'setosa' 'setosa' 'virginica'
##  'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
##  'versicolor' 'setosa' 'virginica' 'virginica' 'versicolor' 'setosa'
##  'virginica' 'versicolor' 'versicolor' 'virginica' 'setosa' 'virginica'
##  'setosa' 'setosa']

Porcentaje de predicción global

print("Precisión en Testing: {:.3f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.978

Matriz de confusión

prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[16  0  0]
##  [ 0 17  1]
##  [ 0  0 11]]

Índices de Calidad del Modelo

indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[16  0  0]
##  [ 0 17  1]
##  [ 0  0 11]]
## 
## Precisión Global:
## 0.9777777777777777
## 
## Error Global:
## 0.022222222222222254
## 
## Precisión por Categoría:
##    setosa  versicolor  virginica
## 0     1.0    0.944444        1.0

Graficando el árbol

# NOTE: sklearn sorts class labels alphabetically, so instancia_arbol.classes_
# is ['setosa', 'versicolor', 'virginica']; class_names must follow that same
# order. The previous order ["Setosa", "Virginica", "Versicolor"] swapped the
# labels of the versicolor and virginica leaves in the rendered tree.
dot_data = export_graphviz(instancia_arbol, out_file=None,
                class_names=["Setosa", "Versicolor", "Virginica"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data) 
graficar_arbol(grafico)

Ejemplo Scoring de Crédito

os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('MuestraCredito5000.csv',delimiter=';',decimal=".")
print(datos.shape)
## (5000, 6)
print(datos.head())
##    MontoCredito  IngresoNeto     ...       GradoAcademico  BuenPagador
## 0             1            1     ...                    1           Si
## 1             3            1     ...                    1           Si
## 2             2            1     ...                    1           Si
## 3             1            2     ...                    1           Si
## 4             1            1     ...                    1           Si
## 
## [5 rows x 6 columns]
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 5000 entries, 0 to 4999
## Data columns (total 6 columns):
## MontoCredito         5000 non-null int64
## IngresoNeto          5000 non-null int64
## CoefCreditoAvaluo    5000 non-null int64
## MontoCuota           5000 non-null int64
## GradoAcademico       5000 non-null int64
## BuenPagador          5000 non-null object
## dtypes: int64(5), object(1)
## memory usage: 234.5+ KB
## None

Nota: Está tomando erróneamente los datos como numéricos; en este caso se deben convertir a variables categóricas porque en realidad estos números son códigos. NO es que siempre se deban convertir las variables numéricas a categóricas.

datos['MontoCredito'] = datos['MontoCredito'].astype('category')
datos['IngresoNeto'] = datos['IngresoNeto'].astype('category')
datos['CoefCreditoAvaluo'] = datos['CoefCreditoAvaluo'].astype('category')
datos['MontoCuota'] = datos['MontoCuota'].astype('category')
datos['GradoAcademico'] = datos['GradoAcademico'].astype('category')
print(datos.head())
##   MontoCredito IngresoNeto     ...     GradoAcademico BuenPagador
## 0            1           1     ...                  1          Si
## 1            3           1     ...                  1          Si
## 2            2           1     ...                  1          Si
## 3            1           2     ...                  1          Si
## 4            1           1     ...                  1          Si
## 
## [5 rows x 6 columns]
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 5000 entries, 0 to 4999
## Data columns (total 6 columns):
## MontoCredito         5000 non-null category
## IngresoNeto          5000 non-null category
## CoefCreditoAvaluo    5000 non-null category
## MontoCuota           5000 non-null category
## GradoAcademico       5000 non-null category
## BuenPagador          5000 non-null object
## dtypes: category(5), object(1)
## memory usage: 64.5+ KB
## None

Elimina la variable categórica, deja las variables predictoras en X

X = datos.iloc[:,:5] 
print(X.head())
##   MontoCredito IngresoNeto      ...       MontoCuota GradoAcademico
## 0            1           1      ...                1              1
## 1            3           1      ...                1              1
## 2            2           1      ...                1              1
## 3            1           2      ...                1              1
## 4            1           1      ...                1              1
## 
## [5 rows x 5 columns]

Deja la variable a predecir en y

y = datos.iloc[:,5:6] 
print(y.head())
##   BuenPagador
## 0          Si
## 1          Si
## 2          Si
## 3          Si
## 4          Si

Con el 75% de los datos para entrenamiento

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
##   FutureWarning)

Mediante el constructor inicializa la instancia

instancia_arbol = DecisionTreeClassifier(random_state=0)
instancia_arbol.fit(X_train,y_train)

Imprime las predicciones en testing

print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'Si' ... 'Si' 'Si' 'No']

Porcentaje de predicción global

print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.95

Matriz de confusión

prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[ 119   47]
##  [  13 1071]]

Índices de Calidad del Modelo

indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[ 119   47]
##  [  13 1071]]
## 
## Precisión Global:
## 0.952
## 
## Error Global:
## 0.04800000000000004
## 
## Precisión por Categoría:
##          No        Si
## 0  0.716867  0.988007

Graficando el árbol

dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data) 
graficar_arbol(grafico)

Nota: El árbol es demasiado grande porque min_samples_leaf=1

Vamos a podar el árbol, puede ser que la calidad de la predicción sea menor

instancia_arbol = DecisionTreeClassifier(random_state=0,min_samples_leaf=150)
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
##             max_features=None, max_leaf_nodes=None,
##             min_impurity_decrease=0.0, min_impurity_split=None,
##             min_samples_leaf=150, min_samples_split=2,
##             min_weight_fraction_leaf=0.0, presort=False, random_state=0,
##             splitter='best')

Entrena el modelo llamando al método fit

Observe que no hay variable que guarde el modelo como en R, esto se debe a que al ser Python orientado a Objetos, el modelo queda en un atributo de la instancia “instancia_arbol” por defecto:

instancia_arbol.fit(X_train,y_train)

Imprime las predicciones en testing

print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'No' ... 'Si' 'Si' 'No']

Porcentaje de predicción global

print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.92

Matriz de confusión

prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[  90   76]
##  [  26 1058]]

Índices de calidad

indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[  90   76]
##  [  26 1058]]
## 
## Precisión Global:
## 0.9184
## 
## Error Global:
## 0.0816
## 
## Precisión por Categoría:
##          No        Si
## 0  0.542169  0.976015

Graficando el árbol

dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)

Parámetros más importantes:

class sklearn.tree.DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)[source]

criterion: string, optional (default=”gini”) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

random_state: int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator.

max_depth: int or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

min_samples_split: int, float, optional (default=2) The minimum number of samples required to split an internal node:

min_samples_leaf: int, float, optional (default=1) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

max_features: int, float, string or None, optional (default=None) The number of features to consider when looking for the best split:

If int, then consider max_features features at each split. If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split. If “auto”, then max_features=sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If “log2”, then max_features=log2(n_features). If None, then max_features=n_features. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.

max_leaf_nodes: int or None, optional (default=None) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

min_impurity_decrease: float, optional (default=0.) A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

Atributos más importantes:

classes_: array of shape = [n_classes] or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem).

feature_importances_: array of shape = [n_features] Return the feature importances.

n_classes_: int or list The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems).

n_features_: int The number of features when fit is performed.

Vamos a construir el árbol con Entropia para ver si mejora la calidad de la predicción

instancia_arbol = DecisionTreeClassifier(criterion="entropy",)
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
##             max_features=None, max_leaf_nodes=None,
##             min_impurity_decrease=0.0, min_impurity_split=None,
##             min_samples_leaf=1, min_samples_split=2,
##             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
##             splitter='best')

Entrena el modelo llamando al método fit

Observe que no hay variable que guarde el modelo como en R, esto se debe a que al ser Python orientado a Objetos, el modelo queda en un atributo de la instancia “instancia_arbol” por defecto:

instancia_arbol.fit(X_train,y_train)

Imprime las predicciones en testing

print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'No' ... 'Si' 'Si' 'No']

Porcentaje de predicción global

print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.95

Matriz de confusión

prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[ 120   46]
##  [  13 1071]]

Índices de calidad

indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[ 120   46]
##  [  13 1071]]
## 
## Precisión Global:
## 0.9528
## 
## Error Global:
## 0.04720000000000002
## 
## Precisión por Categoría:
##          No        Si
## 0  0.722892  0.988007

Graficando el árbol

dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)

Ejemplo con SAHeart.csv

Se deben recodificar las categorías con números

os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.shape)
## (462, 10)
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null object
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: float64(5), int64(3), object(2)
## memory usage: 36.2+ KB
## None
print(datos.head())
# Convierte las variables de object a categórica
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null category
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
# Recodifica las categorías usando números
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
datos["famhist"] = datos["famhist"].cat.codes
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null int8
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: float64(5), int64(3), int8(1), object(1)
## memory usage: 33.0+ KB
## None
print(datos.head())
# Convierte las variables de entero a categórica
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11        1     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61        0     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28        1     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03        1     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78        1     60    25.99    57.34   49  Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null category
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
##    sbp  tobacco   ldl  adiposity famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11       1     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61       0     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28       1     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03       1     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78       1     60    25.99    57.34   49  Si
X = datos.iloc[:,:9] 
print(X.head())
##    sbp  tobacco   ldl  adiposity famhist  typea  obesity  alcohol  age
## 0  160    12.00  5.73      23.11       1     49    25.30    97.20   52
## 1  144     0.01  4.41      28.61       0     55    28.87     2.06   63
## 2  118     0.08  3.48      32.28       1     52    29.14     3.81   46
## 3  170     7.50  6.41      38.03       1     51    31.99    24.26   58
## 4  134    13.60  3.50      27.78       1     60    25.99    57.34   49
y = datos.iloc[:,9:10] 
print(y.head())
##   chd
## 0  Si
## 1  Si
## 2  No
## 3  Si
## 4  Si
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
##   FutureWarning)
instancia_arbol = DecisionTreeClassifier(criterion="gini")
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
##             max_features=None, max_leaf_nodes=None,
##             min_impurity_decrease=0.0, min_impurity_split=None,
##             min_samples_leaf=1, min_samples_split=2,
##             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
##             splitter='best')
instancia_arbol.fit(X_train,y_train)
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[49 12]
##  [11 21]]
## 
## Precisión Global:
## 0.7526881720430108
## 
## Error Global:
## 0.24731182795698925
## 
## Precisión por Categoría:
##          No       Si
## 0  0.803279  0.65625
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)

Se deben recodificar las categorías con números

# Load the SAheart data set (coronary heart disease study): 462 rows,
# 9 predictors plus the target 'chd'; fields separated by ';', '.' as the
# decimal mark.
# NOTE(review): hard-coded absolute path -- this only runs on the author's
# machine; consider a relative path or a configurable data directory.
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.shape)
## (462, 10)
# info() shows 'famhist' and 'chd' load as generic object columns; the
# following chunks recode 'famhist' numerically before modeling.
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null object
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: float64(5), int64(3), object(2)
## memory usage: 36.2+ KB
## None
print(datos.head())
# Convert the variable from object to categorical
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
# Recast 'famhist' as a pandas categorical column; the dtype changes from
# object to category (see the info() output below) while the visible labels
# (Present/Absent) stay the same.
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null category
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
# Recode the categories using numbers
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
# Replace the category labels by their integer codes; per the head() output
# below, Absent -> 0 and Present -> 1. The dtype becomes int8.
datos["famhist"] = datos["famhist"].cat.codes
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null int8
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: float64(5), int64(3), int8(1), object(1)
## memory usage: 33.0+ KB
## None
print(datos.head())
# Convert the variable from integer back to categorical
##    sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11        1     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61        0     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28        1     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03        1     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78        1     60    25.99    57.34   49  Si
# Recast the now-numeric 'famhist' codes (0/1) as a categorical column again,
# so the column is marked as a category with numeric labels rather than a
# plain int8 (compare the dtypes line below with the previous chunk).
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp          462 non-null int64
## tobacco      462 non-null float64
## ldl          462 non-null float64
## adiposity    462 non-null float64
## famhist      462 non-null category
## typea        462 non-null int64
## obesity      462 non-null float64
## alcohol      462 non-null float64
## age          462 non-null int64
## chd          462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
##    sbp  tobacco   ldl  adiposity famhist  typea  obesity  alcohol  age chd
## 0  160    12.00  5.73      23.11       1     49    25.30    97.20   52  Si
## 1  144     0.01  4.41      28.61       0     55    28.87     2.06   63  Si
## 2  118     0.08  3.48      32.28       1     52    29.14     3.81   46  No
## 3  170     7.50  6.41      38.03       1     51    31.99    24.26   58  Si
## 4  134    13.60  3.50      27.78       1     60    25.99    57.34   49  Si
# Predictor matrix: the first 9 columns (everything except the target 'chd').
X = datos.iloc[:,:9] 
print(X.head())
##    sbp  tobacco   ldl  adiposity famhist  typea  obesity  alcohol  age
## 0  160    12.00  5.73      23.11       1     49    25.30    97.20   52
## 1  144     0.01  4.41      28.61       0     55    28.87     2.06   63
## 2  118     0.08  3.48      32.28       1     52    29.14     3.81   46
## 3  170     7.50  6.41      38.03       1     51    31.99    24.26   58
## 4  134    13.60  3.50      27.78       1     60    25.99    57.34   49
# Target: column 9, 'chd' (No/Si).
# NOTE(review): iloc[:,9:10] yields a one-column DataFrame rather than a 1-D
# Series; scikit-learn accepts it but typically emits a DataConversionWarning
# on fit() -- datos.iloc[:,9] (or .values.ravel()) would give a 1-D target.
y = datos.iloc[:,9:10] 
print(y.head())
##   chd
## 0  Si
## 1  Si
## 2  No
## 3  Si
## 4  Si
# Split into 80% training / 20% test partitions. Both train_size and
# test_size are given explicitly: older scikit-learn versions raised a
# FutureWarning ("From version 0.21, test_size will always complement
# train_size unless both are specified.") when only train_size was set, and
# being explicit makes the split unambiguous across versions.
# random_state=0 makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=0)
# Same pipeline as the previous model, but with the tree depth capped at 4
# (max_depth=4) to produce a simpler, less overfit tree.
instancia_arbol = DecisionTreeClassifier(max_depth=4,criterion="gini")
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
##             max_features=None, max_leaf_nodes=None,
##             min_impurity_decrease=0.0, min_impurity_split=None,
##             min_samples_leaf=1, min_samples_split=2,
##             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
##             splitter='best')
# Train on the 80% split and predict the held-out 20%.
instancia_arbol.fit(X_train,y_train)
prediccion = instancia_arbol.predict(X_test)
# Confusion matrix and derived quality indices via the indices_general helper
# defined at the top of the file.
MC = confusion_matrix(y_test, prediccion)
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
    print("\n%s:\n%s"%(k,str(indices[k])))
## 
## Matriz de Confusión:
## [[53  8]
##  [25  7]]
## 
## Precisión Global:
## 0.6451612903225806
## 
## Error Global:
## 0.3548387096774194
## 
## Precisión por Categoría:
##          No       Si
## 0  0.868852  0.21875
# Export the pruned tree to Graphviz DOT and render it inline.
# NOTE(review): class_names=["No", "Si"] assumes alphabetical label order
# (sklearn sorts classes_); "No" < "Si", so this matches, but verify.
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
                feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)